library(tidyverse) # includes ggplot2
library(skimr) # provides a compact and informative summary of your data frame or dataset
library(lubridate)
library(janitor) # set of utility functions for data cleaning and data frame tidying tasks
library(RColorBrewer) # Color palettes for data visualization
library(ggcorrplot) # Visualize correlation matrices using ggplot2
library(scales) # formatting and transforming data for visualizations
# display.brewer.all(colorblindFriendly = TRUE)
# Clean environment
# NOTE(review): rm(list = ls()) in a script is discouraged — it clears only
# the global environment (not loaded packages, options, or par settings) and
# can surprise anyone who sources this file; prefer restarting the R session.
rm(list = ls())
# Helper: read one raw Fitbit CSV from the original_data folder with a
# consistent configuration (trim surrounding whitespace, suppress the
# column-specification message). All eight files below used identical
# read_csv() arguments, so the repetition is factored out here.
read_raw <- function(file_name) {
  read_csv(file.path("original_data", file_name),
           trim_ws = TRUE,
           show_col_types = FALSE)
}

daily_activity     <- read_raw("dailyActivity_merged.csv")
daily_sleep        <- read_raw("sleepDay_merged.csv")
hourly_calories    <- read_raw("hourlyCalories_merged.csv")
hourly_intensities <- read_raw("hourlyIntensities_merged.csv")
hourly_steps       <- read_raw("hourlySteps_merged.csv")
minute_sleep       <- read_raw("minuteSleep_merged.csv")
weight_logs        <- read_raw("weightLogInfo_merged.csv")
seconds_heartrate  <- read_raw("heartrate_seconds_merged.csv")
# Remove trailing spaces (trim_ws = TRUE)
# Check daily_activity data set before cleaning
glimpse(daily_activity)
Rows: 940
Columns: 15
$ Id <dbl> 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 15039603…
$ ActivityDate <chr> "4/12/2016", "4/13/2016", "4/14/2016", "4/15/2016", "4/16/2016", "4/17/2016", "4/18/2016", "…
$ TotalSteps <dbl> 13162, 10735, 10460, 9762, 12669, 9705, 13019, 15506, 10544, 9819, 12764, 14371, 10039, 1535…
$ TotalDistance <dbl> 8.50, 6.97, 6.74, 6.28, 8.16, 6.48, 8.59, 9.88, 6.68, 6.34, 8.13, 9.04, 6.41, 9.80, 8.79, 12…
$ TrackerDistance <dbl> 8.50, 6.97, 6.74, 6.28, 8.16, 6.48, 8.59, 9.88, 6.68, 6.34, 8.13, 9.04, 6.41, 9.80, 8.79, 12…
$ LoggedActivitiesDistance <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ VeryActiveDistance <dbl> 1.88, 1.57, 2.44, 2.14, 2.71, 3.19, 3.25, 3.53, 1.96, 1.34, 4.76, 2.81, 2.92, 5.29, 2.33, 6.…
$ ModeratelyActiveDistance <dbl> 0.55, 0.69, 0.40, 1.26, 0.41, 0.78, 0.64, 1.32, 0.48, 0.35, 1.12, 0.87, 0.21, 0.57, 0.92, 0.…
$ LightActiveDistance <dbl> 6.06, 4.71, 3.91, 2.83, 5.04, 2.51, 4.71, 5.03, 4.24, 4.65, 2.24, 5.36, 3.28, 3.94, 5.54, 5.…
$ SedentaryActiveDistance <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.…
$ VeryActiveMinutes <dbl> 25, 21, 30, 29, 36, 38, 42, 50, 28, 19, 66, 41, 39, 73, 31, 78, 48, 16, 52, 33, 41, 50, 36, …
$ FairlyActiveMinutes <dbl> 13, 19, 11, 34, 10, 20, 16, 31, 12, 8, 27, 21, 5, 14, 23, 11, 28, 12, 34, 35, 15, 24, 22, 24…
$ LightlyActiveMinutes <dbl> 328, 217, 181, 209, 221, 164, 233, 264, 205, 211, 130, 262, 238, 216, 279, 243, 189, 243, 21…
$ SedentaryMinutes <dbl> 728, 776, 1218, 726, 773, 539, 1149, 775, 818, 838, 1217, 732, 709, 814, 833, 1108, 782, 815…
$ Calories <dbl> 1985, 1797, 1776, 1745, 1863, 1728, 1921, 2035, 1786, 1775, 1827, 1949, 1788, 2013, 1970, 21…
# Data-quality check: NA cells, fully duplicated rows, and distinct users
na_cells <- sum(is.na(daily_activity))
dup_rows <- sum(duplicated(daily_activity))
cat("\n", "Missing values:", na_cells,
    "\n", "Duplicate values:", dup_rows,
    "\n", "Unique Ids:", n_distinct(daily_activity$Id))
Missing values: 0
Duplicate values: 0
Unique Ids: 33
Let us clean: - Change column names to lower case because R is case sensitive - Change “Id” from double to a character because the number represents a category - Change “ActivityDate” from char to date
# Tidy up daily_activity: snake_case column names, id coerced to character
# (it labels a user, it is not a quantity), activity_date parsed from
# "m/d/Y" text into a Date, and exact duplicate rows dropped.
daily_activity <- daily_activity %>%
  clean_names() %>%
  mutate(
    id = as.character(id),
    activity_date = as.Date(activity_date, format = "%m/%d/%Y")
  ) %>%
  distinct()
# Check daily_activity data set after cleaning
glimpse(daily_activity)
Rows: 940
Columns: 15
$ id <chr> "1503960366", "1503960366", "1503960366", "1503960366", "1503960366", "1503960366", "15039…
$ activity_date <date> 2016-04-12, 2016-04-13, 2016-04-14, 2016-04-15, 2016-04-16, 2016-04-17, 2016-04-18, 2016-…
$ total_steps <dbl> 13162, 10735, 10460, 9762, 12669, 9705, 13019, 15506, 10544, 9819, 12764, 14371, 10039, 15…
$ total_distance <dbl> 8.50, 6.97, 6.74, 6.28, 8.16, 6.48, 8.59, 9.88, 6.68, 6.34, 8.13, 9.04, 6.41, 9.80, 8.79, …
$ tracker_distance <dbl> 8.50, 6.97, 6.74, 6.28, 8.16, 6.48, 8.59, 9.88, 6.68, 6.34, 8.13, 9.04, 6.41, 9.80, 8.79, …
$ logged_activities_distance <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ very_active_distance <dbl> 1.88, 1.57, 2.44, 2.14, 2.71, 3.19, 3.25, 3.53, 1.96, 1.34, 4.76, 2.81, 2.92, 5.29, 2.33, …
$ moderately_active_distance <dbl> 0.55, 0.69, 0.40, 1.26, 0.41, 0.78, 0.64, 1.32, 0.48, 0.35, 1.12, 0.87, 0.21, 0.57, 0.92, …
$ light_active_distance <dbl> 6.06, 4.71, 3.91, 2.83, 5.04, 2.51, 4.71, 5.03, 4.24, 4.65, 2.24, 5.36, 3.28, 3.94, 5.54, …
$ sedentary_active_distance <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, …
$ very_active_minutes <dbl> 25, 21, 30, 29, 36, 38, 42, 50, 28, 19, 66, 41, 39, 73, 31, 78, 48, 16, 52, 33, 41, 50, 36…
$ fairly_active_minutes <dbl> 13, 19, 11, 34, 10, 20, 16, 31, 12, 8, 27, 21, 5, 14, 23, 11, 28, 12, 34, 35, 15, 24, 22, …
$ lightly_active_minutes <dbl> 328, 217, 181, 209, 221, 164, 233, 264, 205, 211, 130, 262, 238, 216, 279, 243, 189, 243, …
$ sedentary_minutes <dbl> 728, 776, 1218, 726, 773, 539, 1149, 775, 818, 838, 1217, 732, 709, 814, 833, 1108, 782, 8…
$ calories <dbl> 1985, 1797, 1776, 1745, 1863, 1728, 1921, 2035, 1786, 1775, 1827, 1949, 1788, 2013, 1970, …
# Re-run the quality check on the cleaned data set
na_cells <- sum(is.na(daily_activity))
dup_rows <- sum(duplicated(daily_activity))
cat("\n", "Missing values:", na_cells,
    "\n", "Duplicate values:", dup_rows)
Missing values: 0
Duplicate values: 0
# Summary statistics (min/quartiles/mean/max) for every column
summary(daily_activity)
id activity_date total_steps total_distance tracker_distance logged_activities_distance
Length:940 Min. :2016-04-12 Min. : 0 Min. : 0.000 Min. : 0.000 Min. :0.0000
Class :character 1st Qu.:2016-04-19 1st Qu.: 3790 1st Qu.: 2.620 1st Qu.: 2.620 1st Qu.:0.0000
Mode :character Median :2016-04-26 Median : 7406 Median : 5.245 Median : 5.245 Median :0.0000
Mean :2016-04-26 Mean : 7638 Mean : 5.490 Mean : 5.475 Mean :0.1082
3rd Qu.:2016-05-04 3rd Qu.:10727 3rd Qu.: 7.713 3rd Qu.: 7.710 3rd Qu.:0.0000
Max. :2016-05-12 Max. :36019 Max. :28.030 Max. :28.030 Max. :4.9421
very_active_distance moderately_active_distance light_active_distance sedentary_active_distance very_active_minutes
Min. : 0.000 Min. :0.0000 Min. : 0.000 Min. :0.000000 Min. : 0.00
1st Qu.: 0.000 1st Qu.:0.0000 1st Qu.: 1.945 1st Qu.:0.000000 1st Qu.: 0.00
Median : 0.210 Median :0.2400 Median : 3.365 Median :0.000000 Median : 4.00
Mean : 1.503 Mean :0.5675 Mean : 3.341 Mean :0.001606 Mean : 21.16
3rd Qu.: 2.053 3rd Qu.:0.8000 3rd Qu.: 4.782 3rd Qu.:0.000000 3rd Qu.: 32.00
Max. :21.920 Max. :6.4800 Max. :10.710 Max. :0.110000 Max. :210.00
fairly_active_minutes lightly_active_minutes sedentary_minutes calories
Min. : 0.00 Min. : 0.0 Min. : 0.0 Min. : 0
1st Qu.: 0.00 1st Qu.:127.0 1st Qu.: 729.8 1st Qu.:1828
Median : 6.00 Median :199.0 Median :1057.5 Median :2134
Mean : 13.56 Mean :192.8 Mean : 991.2 Mean :2304
3rd Qu.: 19.00 3rd Qu.:264.0 3rd Qu.:1229.5 3rd Qu.:2793
Max. :143.00 Max. :518.0 Max. :1440.0 Max. :4900
This summary helps us explore each attribute quickly. We notice that some attributes have a minimum value of zero (total_steps, total_distance, calories). Let us explore this observation.
# Rows recorded with zero steps (presumably days the Fitbit was not worn)
daily_activity %>% filter(total_steps == 0)
We found 77 observations where total_steps is zero. We should delete these observations so that they do not skew the mean and median. If total_steps is zero, that most likely means the person did not wear the Fitbit that day.
# Rows recorded with zero calories
daily_activity %>% filter(calories == 0)
# Rows recorded with zero total distance
daily_activity %>% filter(total_distance == 0)
From our inspection above, we can see that we just need to delete the entries where total_steps is zero, and that will take care of the rest.
# Keep only rows where steps, distance, and calories are all non-zero
daily_activity_clean <- daily_activity %>%
  filter(total_steps != 0 & total_distance != 0 & calories != 0)
daily_activity_clean
NA
names(daily_activity)
[1] "id" "activity_date" "total_steps" "total_distance"
[5] "tracker_distance" "logged_activities_distance" "very_active_distance" "moderately_active_distance"
[9] "light_active_distance" "sedentary_active_distance" "very_active_minutes" "fairly_active_minutes"
[13] "lightly_active_minutes" "sedentary_minutes" "calories"
# Check the attributes again
cat("Before deleting the entries\n\n")
Before deleting the entries
daily_activity %>%
  select(total_steps, total_distance, calories) %>%
  summary()
total_steps total_distance calories
Min. : 0 Min. : 0.000 Min. : 0
1st Qu.: 3790 1st Qu.: 2.620 1st Qu.:1828
Median : 7406 Median : 5.245 Median :2134
Mean : 7638 Mean : 5.490 Mean :2304
3rd Qu.:10727 3rd Qu.: 7.713 3rd Qu.:2793
Max. :36019 Max. :28.030 Max. :4900
cat("\n\n\n",
"\t\t vs",
"\n\n\n")
vs
cat("After deleting the entries\n\n")
After deleting the entries
daily_activity_clean %>%
  select(total_steps, total_distance, calories) %>%
  summary()
total_steps total_distance calories
Min. : 8 Min. : 0.010 Min. : 52
1st Qu.: 4927 1st Qu.: 3.373 1st Qu.:1857
Median : 8054 Median : 5.590 Median :2220
Mean : 8329 Mean : 5.986 Mean :2362
3rd Qu.:11096 3rd Qu.: 7.905 3rd Qu.:2832
Max. :36019 Max. :28.030 Max. :4900
We can see that the observation we removed affected our mean and median.
# Check daily_sleep data set before cleaning
glimpse(daily_sleep)
Rows: 413
Columns: 5
$ Id <dbl> 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 15…
$ SleepDay <chr> "4/12/2016 12:00:00 AM", "4/13/2016 12:00:00 AM", "4/15/2016 12:00:00 AM", "4/16/2016 12:00:00 AM"…
$ TotalSleepRecords <dbl> 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3,…
$ TotalMinutesAsleep <dbl> 327, 384, 412, 340, 700, 304, 360, 325, 361, 430, 277, 245, 366, 341, 404, 369, 277, 273, 247, 334…
$ TotalTimeInBed <dbl> 346, 407, 442, 367, 712, 320, 377, 364, 384, 449, 323, 274, 393, 354, 425, 396, 309, 296, 264, 367…
# Data-quality check: NA cells, duplicated rows, and distinct users
na_cells <- sum(is.na(daily_sleep))
dup_rows <- sum(duplicated(daily_sleep))
cat("\n", "Missing values:", na_cells,
    "\n", "Duplicate values:", dup_rows,
    "\n", "Unique Ids:", n_distinct(daily_sleep$Id))
Missing values: 0
Duplicate values: 3
Unique Ids: 24
Let us clean:
# Tidy up daily_sleep: snake_case names, id coerced to character, and
# sleep_day parsed as a Date (the visible sample shows a constant
# "12:00:00 AM" time component, which as.Date discards). distinct()
# removes the 3 duplicate rows found above.
daily_sleep_clean <- daily_sleep %>%
  clean_names() %>%
  mutate(
    id = as.character(id),
    sleep_day = as.Date(sleep_day, format = "%m/%d/%Y")
  ) %>%
  distinct()
# Check clean daily_sleep data set
glimpse(daily_sleep_clean)
Rows: 410
Columns: 5
$ id <chr> "1503960366", "1503960366", "1503960366", "1503960366", "1503960366", "1503960366", "1503960366"…
$ sleep_day <date> 2016-04-12, 2016-04-13, 2016-04-15, 2016-04-16, 2016-04-17, 2016-04-19, 2016-04-20, 2016-04-21,…
$ total_sleep_records <dbl> 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ total_minutes_asleep <dbl> 327, 384, 412, 340, 700, 304, 360, 325, 361, 430, 277, 245, 366, 341, 404, 369, 277, 273, 247, 3…
$ total_time_in_bed <dbl> 346, 407, 442, 367, 712, 320, 377, 364, 384, 449, 323, 274, 393, 354, 425, 396, 309, 296, 264, 3…
# Re-run the quality check on the cleaned data set
na_cells <- sum(is.na(daily_sleep_clean))
dup_rows <- sum(duplicated(daily_sleep_clean))
cat("\n", "Missing values:", na_cells,
    "\n", "Duplicate values:", dup_rows)
Missing values: 0
Duplicate values: 0
# Check hourly_calories data set before cleaning
glimpse(hourly_calories)
Rows: 22,099
Columns: 3
$ Id <dbl> 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 15039603…
$ ActivityHour <chr> "4/12/2016 12:00:00 AM", "4/12/2016 1:00:00 AM", "4/12/2016 2:00:00 AM", "4/12/2016 3:00:00 AM", "4/12/2…
$ Calories <dbl> 81, 61, 59, 47, 48, 48, 48, 47, 68, 141, 99, 76, 73, 66, 110, 151, 76, 83, 124, 104, 132, 100, 65, 81, 6…
# Data-quality check: NA cells and duplicated rows
na_cells <- sum(is.na(hourly_calories))
dup_rows <- sum(duplicated(hourly_calories))
cat("\n", "Missing values:", na_cells,
    "\n", "Duplicate values:", dup_rows)
Missing values: 0
Duplicate values: 0
# Check hourly_intensities data set before cleaning
glimpse(hourly_intensities)
Rows: 22,099
Columns: 4
$ Id <dbl> 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 1503…
$ ActivityHour <chr> "4/12/2016 12:00:00 AM", "4/12/2016 1:00:00 AM", "4/12/2016 2:00:00 AM", "4/12/2016 3:00:00 AM", "4/…
$ TotalIntensity <dbl> 20, 8, 7, 0, 0, 0, 0, 0, 13, 30, 29, 12, 11, 6, 36, 58, 13, 16, 29, 39, 41, 31, 9, 21, 14, 0, 0, 4, …
$ AverageIntensity <dbl> 0.333333, 0.133333, 0.116667, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.216667, 0.500000, …
# Data-quality check: NA cells and duplicated rows
na_cells <- sum(is.na(hourly_intensities))
dup_rows <- sum(duplicated(hourly_intensities))
cat("\n", "Missing values:", na_cells,
    "\n", "Duplicate values:", dup_rows)
Missing values: 0
Duplicate values: 0
# Check hourly_steps data set before cleaning
glimpse(hourly_steps)
Rows: 22,099
Columns: 3
$ Id <dbl> 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 15039603…
$ ActivityHour <chr> "4/12/2016 12:00:00 AM", "4/12/2016 1:00:00 AM", "4/12/2016 2:00:00 AM", "4/12/2016 3:00:00 AM", "4/12/2…
$ StepTotal <dbl> 373, 160, 151, 0, 0, 0, 0, 0, 250, 1864, 676, 360, 253, 221, 1166, 2063, 344, 489, 1386, 558, 1733, 684,…
# Data-quality check: NA cells and duplicated rows
na_cells <- sum(is.na(hourly_steps))
dup_rows <- sum(duplicated(hourly_steps))
cat("\n", "Missing values:", na_cells,
    "\n", "Duplicate values:", dup_rows)
Missing values: 0
Duplicate values: 0
These data sets share the same Id and ActivityHour key columns, so let us join them into a new data set (hourly_activity) before we clean them.
# Merge the three hourly tables on their shared (Id, ActivityHour) key.
# reduce() folds inner_join over the list, equivalent to joining the
# tables pairwise in order.
hourly_activity <-
  list(hourly_calories, hourly_intensities, hourly_steps) %>%
  reduce(inner_join, by = c("Id", "ActivityHour"))
# Check hourly_activity data set before cleaning
glimpse(hourly_activity)
Rows: 22,099
Columns: 6
$ Id <dbl> 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 1503…
$ ActivityHour <chr> "4/12/2016 12:00:00 AM", "4/12/2016 1:00:00 AM", "4/12/2016 2:00:00 AM", "4/12/2016 3:00:00 AM", "4/…
$ Calories <dbl> 81, 61, 59, 47, 48, 48, 48, 47, 68, 141, 99, 76, 73, 66, 110, 151, 76, 83, 124, 104, 132, 100, 65, 8…
$ TotalIntensity <dbl> 20, 8, 7, 0, 0, 0, 0, 0, 13, 30, 29, 12, 11, 6, 36, 58, 13, 16, 29, 39, 41, 31, 9, 21, 14, 0, 0, 4, …
$ AverageIntensity <dbl> 0.333333, 0.133333, 0.116667, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.216667, 0.500000, …
$ StepTotal <dbl> 373, 160, 151, 0, 0, 0, 0, 0, 250, 1864, 676, 360, 253, 221, 1166, 2063, 344, 489, 1386, 558, 1733, …
# Data-quality check: NA cells and duplicated rows
na_cells <- sum(is.na(hourly_activity))
dup_rows <- sum(duplicated(hourly_activity))
cat("\n", "Missing values:", na_cells,
    "\n", "Duplicate values:", dup_rows)
Missing values: 0
Duplicate values: 0
Let us clean:
Note: as_datetime() parses with the default timezone, UTC.
# Tidy up hourly_activity: snake_case names, id coerced to character, and
# activity_hour parsed from "m/d/Y h:m:s AM/PM" text into a POSIXct
# datetime (as_datetime defaults to the UTC timezone).
hourly_activity_clean <- hourly_activity %>%
  clean_names() %>%
  mutate(
    id = as.character(id),
    activity_hour = as_datetime(activity_hour,
                                format = "%m/%d/%Y %I:%M:%S %p")
  ) %>%
  distinct()
# Check hourly_activity data set after cleaning
glimpse(hourly_activity_clean)
Rows: 22,099
Columns: 6
$ id <chr> "1503960366", "1503960366", "1503960366", "1503960366", "1503960366", "1503960366", "1503960366", "…
$ activity_hour <dttm> 2016-04-12 00:00:00, 2016-04-12 01:00:00, 2016-04-12 02:00:00, 2016-04-12 03:00:00, 2016-04-12 04:…
$ calories <dbl> 81, 61, 59, 47, 48, 48, 48, 47, 68, 141, 99, 76, 73, 66, 110, 151, 76, 83, 124, 104, 132, 100, 65, …
$ total_intensity <dbl> 20, 8, 7, 0, 0, 0, 0, 0, 13, 30, 29, 12, 11, 6, 36, 58, 13, 16, 29, 39, 41, 31, 9, 21, 14, 0, 0, 4,…
$ average_intensity <dbl> 0.333333, 0.133333, 0.116667, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.216667, 0.500000,…
$ step_total <dbl> 373, 160, 151, 0, 0, 0, 0, 0, 250, 1864, 676, 360, 253, 221, 1166, 2063, 344, 489, 1386, 558, 1733,…
# Re-run the quality check on the cleaned data set
na_cells <- sum(is.na(hourly_activity_clean))
dup_rows <- sum(duplicated(hourly_activity_clean))
cat("\n", "Missing values:", na_cells,
    "\n", "Duplicate values:", dup_rows)
Missing values: 0
Duplicate values: 0
# as_datetime() converts with default timezone = "UTC"
# Check minute_sleep data set before cleaning
glimpse(minute_sleep)
Rows: 188,521
Columns: 4
$ Id <dbl> 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 1503960366, 150…
$ date <chr> "4/12/2016 2:47:30 AM", "4/12/2016 2:48:30 AM", "4/12/2016 2:49:30 AM", "4/12/2016 2:50:30 AM", "4/12/2016 2:51…
$ value <dbl> 3, 2, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ logId <dbl> 11380564589, 11380564589, 11380564589, 11380564589, 11380564589, 11380564589, 11380564589, 11380564589, 1138056…
# Data-quality check: NA cells, duplicated rows, and distinct users
na_cells <- sum(is.na(minute_sleep))
dup_rows <- sum(duplicated(minute_sleep))
cat("\n", "Missing values:", na_cells,
    "\n", "Duplicate values:", dup_rows,
    "\n", "Unique Ids:", n_distinct(minute_sleep$Id))
Missing values: 0
Duplicate values: 543
Unique Ids: 24
Let us clean:
# Clean minute_sleep data set
minute_sleep_clean <-
# Clean column names
clean_names(minute_sleep) %>%
# Correct column types
mutate(value = as.factor(value)) %>% # from double to factor (sleep-state code; the original comment wrongly said "chr")
mutate(id = as.character(id)) %>% # from double to chr
mutate(date = as_datetime(date,
format = "%m/%d/%Y %I:%M:%S %p")) %>% # from chr to datetime (UTC by default)
# Remove duplicate rows
distinct()
# Check minute_sleep data set after cleaning
glimpse(minute_sleep_clean)
Rows: 187,978
Columns: 4
$ id <chr> "1503960366", "1503960366", "1503960366", "1503960366", "1503960366", "1503960366", "1503960366", "1503960366"…
$ date <dttm> 2016-04-12 02:47:30, 2016-04-12 02:48:30, 2016-04-12 02:49:30, 2016-04-12 02:50:30, 2016-04-12 02:51:30, 2016…
$ value <fct> 3, 2, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ log_id <dbl> 11380564589, 11380564589, 11380564589, 11380564589, 11380564589, 11380564589, 11380564589, 11380564589, 113805…
# Re-run the quality check on the cleaned data set
na_cells <- sum(is.na(minute_sleep_clean))
dup_rows <- sum(duplicated(minute_sleep_clean))
cat("\n", "Missing values:", na_cells,
    "\n", "Duplicate values:", dup_rows)
Missing values: 0
Duplicate values: 0
# Check seconds_heartrate set before cleaning
glimpse(seconds_heartrate)
Rows: 2,483,658
Columns: 3
$ Id <dbl> 2022484408, 2022484408, 2022484408, 2022484408, 2022484408, 2022484408, 2022484408, 2022484408, 2022484408, 202…
$ Time <chr> "4/12/2016 7:21:00 AM", "4/12/2016 7:21:05 AM", "4/12/2016 7:21:10 AM", "4/12/2016 7:21:20 AM", "4/12/2016 7:21…
$ Value <dbl> 97, 102, 105, 103, 101, 95, 91, 93, 94, 93, 92, 89, 83, 61, 60, 61, 61, 57, 54, 55, 58, 60, 59, 57, 56, 58, 57,…
# Data-quality check: NA cells and duplicated rows
na_cells <- sum(is.na(seconds_heartrate))
dup_rows <- sum(duplicated(seconds_heartrate))
cat("\n", "Missing values:", na_cells,
    "\n", "Duplicate values:", dup_rows)
Missing values: 0
Duplicate values: 0
Let us clean:
# Tidy up seconds_heartrate: snake_case names, id coerced to character,
# the timestamp parsed as a POSIXct datetime (UTC by default), and the
# generic Time/Value columns renamed to date_time/heart_rate.
seconds_heartrate_clean <- seconds_heartrate %>%
  clean_names() %>%
  mutate(
    id = as.character(id),
    time = as_datetime(time, format = "%m/%d/%Y %I:%M:%S %p")
  ) %>%
  rename(date_time = time, heart_rate = value) %>%
  distinct()
# Check seconds_heartrate data set after cleaning
glimpse(seconds_heartrate_clean)
Rows: 2,483,658
Columns: 3
$ id <chr> "2022484408", "2022484408", "2022484408", "2022484408", "2022484408", "2022484408", "2022484408", "2022484…
$ date_time <dttm> 2016-04-12 07:21:00, 2016-04-12 07:21:05, 2016-04-12 07:21:10, 2016-04-12 07:21:20, 2016-04-12 07:21:25, …
$ heart_rate <dbl> 97, 102, 105, 103, 101, 95, 91, 93, 94, 93, 92, 89, 83, 61, 60, 61, 61, 57, 54, 55, 58, 60, 59, 57, 56, 58…
# Re-run the quality check on the cleaned data set
na_cells <- sum(is.na(seconds_heartrate_clean))
dup_rows <- sum(duplicated(seconds_heartrate_clean))
cat("\n", "Missing values:", na_cells,
    "\n", "Duplicate values:", dup_rows)
Missing values: 0
Duplicate values: 0
# as_datetime() converts with default timezone = "UTC"
# Check weight_logs set before cleaning
glimpse(weight_logs)
Rows: 67
Columns: 8
$ Id <dbl> 1503960366, 1503960366, 1927972279, 2873212765, 2873212765, 4319703577, 4319703577, 4558609924, 455860…
$ Date <chr> "5/2/2016 11:59:59 PM", "5/3/2016 11:59:59 PM", "4/13/2016 1:08:52 AM", "4/21/2016 11:59:59 PM", "5/12…
$ WeightKg <dbl> 52.6, 52.6, 133.5, 56.7, 57.3, 72.4, 72.3, 69.7, 70.3, 69.9, 69.2, 69.1, 90.7, 62.5, 62.1, 61.7, 61.5,…
$ WeightPounds <dbl> 115.9631, 115.9631, 294.3171, 125.0021, 126.3249, 159.6147, 159.3942, 153.6622, 154.9850, 154.1031, 15…
$ Fat <dbl> 22, NA, NA, NA, NA, 25, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ BMI <dbl> 22.65, 22.65, 47.54, 21.45, 21.69, 27.45, 27.38, 27.25, 27.46, 27.32, 27.04, 27.00, 28.00, 24.39, 24.2…
$ IsManualReport <lgl> TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, TRUE…
$ LogId <dbl> 1.462234e+12, 1.462320e+12, 1.460510e+12, 1.461283e+12, 1.463098e+12, 1.460938e+12, 1.462406e+12, 1.46…
# Data-quality check: NA cells and duplicated rows
# (the 65 NAs reported below all sit in the Fat column)
na_cells <- sum(is.na(weight_logs))
dup_rows <- sum(duplicated(weight_logs))
cat("\n", "Missing values:", na_cells,
    "\n", "Duplicate values:", dup_rows)
Missing values: 65
Duplicate values: 0
Let us clean: - Change column names to lower case because R is case sensitive - Change “Id” from double to a character because the number represents a category - Change “Date” from char to datetime and rename it date_time - Change NA to 0 in the column “fat”
# Clean weight_logs data set: snake_case names, id as character, the
# timestamp parsed as a POSIXct datetime (UTC) and renamed date_time,
# NA body-fat readings set to 0, and duplicate rows removed.
#
# BUG FIX: the original ran
#   weight_logs_clean$fat[is.na(weight_logs$fat)] <- 0
# indexing the *uncleaned* weight_logs, whose column is still "Fat".
# weight_logs$fat was NULL, R warned "Unknown or uninitialised column: fat",
# the assignment was a no-op, and all 65 NAs survived (see the later
# missing-value check). The replacement is now done inside the pipeline
# on the cleaned column.
weight_logs_clean <-
  # Clean column names
  clean_names(weight_logs) %>%
  # Correct column types
  mutate(id = as.character(id)) %>% # from double to chr
  mutate(date = as_datetime(date,
                            format = "%m/%d/%Y %I:%M:%S %p")) %>% # from chr to datetime
  # Rename columns
  rename(date_time = date) %>%
  # Change NA to 0 in the column "fat"
  mutate(fat = replace(fat, is.na(fat), 0)) %>%
  # Remove duplicate rows
  distinct()
Warning: Unknown or uninitialised column: `fat`.
# Check weight_logs data set after cleaning
glimpse(weight_logs_clean)
Rows: 67
Columns: 8
$ id <chr> "1503960366", "1503960366", "1927972279", "2873212765", "2873212765", "4319703577", "4319703577", "4…
$ date_time <dttm> 2016-05-02 23:59:59, 2016-05-03 23:59:59, 2016-04-13 01:08:52, 2016-04-21 23:59:59, 2016-05-12 23:5…
$ weight_kg <dbl> 52.6, 52.6, 133.5, 56.7, 57.3, 72.4, 72.3, 69.7, 70.3, 69.9, 69.2, 69.1, 90.7, 62.5, 62.1, 61.7, 61.…
$ weight_pounds <dbl> 115.9631, 115.9631, 294.3171, 125.0021, 126.3249, 159.6147, 159.3942, 153.6622, 154.9850, 154.1031, …
$ fat <dbl> 22, NA, NA, NA, NA, 25, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ bmi <dbl> 22.65, 22.65, 47.54, 21.45, 21.69, 27.45, 27.38, 27.25, 27.46, 27.32, 27.04, 27.00, 28.00, 24.39, 24…
$ is_manual_report <lgl> TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, TR…
$ log_id <dbl> 1.462234e+12, 1.462320e+12, 1.460510e+12, 1.461283e+12, 1.463098e+12, 1.460938e+12, 1.462406e+12, 1.…
# Re-run the quality check on the cleaned data set
na_cells <- sum(is.na(weight_logs_clean))
dup_rows <- sum(duplicated(weight_logs_clean))
cat("\n", "Missing values:", na_cells,
    "\n", "Duplicate values:", dup_rows)
Missing values: 65
Duplicate values: 0
# To uncomment the following code in RStudio, select the lines and press Cmd+Shift+C (Mac) or Ctrl+Shift+C (Windows/Linux)
# write.csv(daily_activity_clean,
# "daily_activity_clean.csv",
# row.names = FALSE)
#
# write.csv(daily_sleep_clean,
# "daily_sleep_clean.csv",
# row.names = FALSE)
#
# write.csv(hourly_activity_clean,
# "hourly_activity_clean.csv",
# row.names = FALSE)
#
# write.csv(minute_sleep_clean,
# "minute_sleep_clean.csv",
# row.names = FALSE)
#
# write.csv(seconds_heartrate_clean,
# "seconds_heartrate_clean.csv",
# row.names = FALSE)
#
# write.csv(weight_logs_clean,
# "weight_logs_clean.csv",
# row.names = FALSE)
str(daily_activity_clean)
tibble [862 × 15] (S3: tbl_df/tbl/data.frame)
$ id : chr [1:862] "1503960366" "1503960366" "1503960366" "1503960366" ...
$ activity_date : Date[1:862], format: "2016-04-12" "2016-04-13" "2016-04-14" "2016-04-15" ...
$ total_steps : num [1:862] 13162 10735 10460 9762 12669 ...
$ total_distance : num [1:862] 8.5 6.97 6.74 6.28 8.16 ...
$ tracker_distance : num [1:862] 8.5 6.97 6.74 6.28 8.16 ...
$ logged_activities_distance: num [1:862] 0 0 0 0 0 0 0 0 0 0 ...
$ very_active_distance : num [1:862] 1.88 1.57 2.44 2.14 2.71 ...
$ moderately_active_distance: num [1:862] 0.55 0.69 0.4 1.26 0.41 ...
$ light_active_distance : num [1:862] 6.06 4.71 3.91 2.83 5.04 ...
$ sedentary_active_distance : num [1:862] 0 0 0 0 0 0 0 0 0 0 ...
$ very_active_minutes : num [1:862] 25 21 30 29 36 38 42 50 28 19 ...
$ fairly_active_minutes : num [1:862] 13 19 11 34 10 20 16 31 12 8 ...
$ lightly_active_minutes : num [1:862] 328 217 181 209 221 164 233 264 205 211 ...
$ sedentary_minutes : num [1:862] 728 776 1218 726 773 ...
$ calories : num [1:862] 1985 1797 1776 1745 1863 ...
# Subset numeric columns.
# select_if() is superseded in dplyr >= 1.0; select(where(...)) is the
# current idiom and returns the same columns.
num_df <- select(daily_activity_clean, where(is.numeric))
# Identify numeric columns
colnames(num_df)
[1] "total_steps" "total_distance" "tracker_distance" "logged_activities_distance"
[5] "very_active_distance" "moderately_active_distance" "light_active_distance" "sedentary_active_distance"
[9] "very_active_minutes" "fairly_active_minutes" "lightly_active_minutes" "sedentary_minutes"
[13] "calories"
# Plot a histogram with an overlaid density curve for every numeric column.
# Fixes vs the original:
#  - aes(num_df[[i]]) reaches around ggplot's data masking; .data[[col]] is
#    the supported way to map a column by its name (string).
#  - ..density.. is deprecated; after_stat(density) is its replacement.
#  - linewidth replaces the deprecated size aesthetic for lines
#    (ggplot2 >= 3.4) — this also removes the need for suppressWarnings().
#  - the y axis shows density, not counts, so it is labelled "Density".
for (col_name in colnames(num_df)) {
  print(
    ggplot(num_df, aes(x = .data[[col_name]])) +
      geom_histogram(
        aes(y = after_stat(density)),
        bins = 30,
        color = "black",
        fill = "gray"
      ) +
      geom_density(color = "blue", linewidth = 1) +
      xlab(col_name) +
      ylab("Density") +
      ggtitle(paste("Histogram and Density Plot of", col_name))
  )
}
NA
NA
NA
NA
Observations:
Many variables show a right-skewed distribution: a larger number of data values are located on the left side of the curve
The variables total_steps, total_distance, tracker_distance have a similar distribution. We can explore their correlations later
Since the distributions are not normal. The median is a better indicator of central tendency for the numerical variables in these data set
The variable logged_activities_distance and sedentary_active_distance might not provide useful information since most of the data points are zero. It seems that the users are not logging the distance frequently
The following variables seem related. We will explore them further in the bivariate analysis section:
sedentary_minutes; sedentary_active_distance lightly_active_minutes;
light_active_distance
fairly_active_minutes; moderately_active_distance very_active_minutes;
very_active_distance
# Subset NON-numeric columns (the original comment said "numeric", but
# negate(is.numeric) keeps the columns that are not numeric — here the
# character id and the Date activity_date). select(where(...)) replaces
# the superseded select_if().
select(daily_activity_clean, where(negate(is.numeric)))
NA
# Bar chart of row counts per id; reorder(id, id, length) sorts the bars
# by each id's number of rows (its frequency in the data)
ggplot(daily_activity_clean) +
  geom_bar(aes(x = reorder(id, id, length))) +
  xlab("id") +
  coord_flip()
#https://stackoverflow.com/a/9231857/15333580
# reorder(id, id, length) takes the id variable, groups it by itself, and applies length() to each group — i.e. it orders the levels of id by how many rows each id has (its frequency), not by the length of the id's name.
# Entries per id, expressed as a percentage of the best-covered id.
# count(name = "count") names the tally column directly, replacing the
# original rename() call whose rename(id = "id") part was a no-op.
count_max_ratio <- daily_activity_clean %>%
  count(id, name = "count") %>%
  mutate(percent_of_max = count / max(count) * 100) %>%
  arrange(desc(percent_of_max))
# Create bar graph with percentage of entries compared to maximum.
# geom_col() is the idiomatic equivalent of geom_bar(stat = "identity").
# Horizontal guide lines mark the 50% and 75% coverage thresholds.
ggplot(count_max_ratio, aes(x = reorder(id, percent_of_max), y = percent_of_max)) +
  geom_col() +
  xlab("ID") +
  ylab("Percentage of Maximum Count") +
  ggtitle("Count by ID and Percentage of Maximum Count") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_hline(yintercept = 50, color = "orange", linewidth = 1) +
  geom_hline(yintercept = 75, color = "red", linewidth = 1) +
  coord_flip()
NA
NA
# Ids with at least 75% of the maximum entry count
percent_of_max_top_75 <- count_max_ratio %>% filter(percent_of_max >= 75)
percent_of_max_top_75
# Ids with fewer than 75% of the maximum entry count
percent_of_max_under_75 <- count_max_ratio %>% filter(percent_of_max < 75)
percent_of_max_under_75
summary(daily_activity_clean$activity_date) # date range covered by the data
Min. 1st Qu. Median Mean 3rd Qu. Max.
"2016-04-12" "2016-04-18" "2016-04-26" "2016-04-26" "2016-05-03" "2016-05-12"
# One-day-wide bins: number of user entries recorded on each date
daily_activity_clean %>%
  ggplot(aes(x = activity_date)) +
  geom_histogram(binwidth = 1, color = "black", fill = "lightblue") +
  labs(x = "Activity Date", y = "Frequency", title = "Distribution of Activity Date")
Observations:
# Investigate if the missing activity data coincides with the absence of
# entries for certain user IDs, by splitting the date histogram between
# well-covered ids (>= 75% of max entries) and the rest.
daily_activity_clean %>%
  filter(id %in% percent_of_max_top_75$id) %>%
  ggplot(aes(x = activity_date)) +
  geom_histogram(binwidth = 1, color = "black", fill = "lightblue") +
  labs(x = "Activity Date", y = "Frequency",
       title = "Distribution of Activity Date For IDs with Above 75% of Entries")
daily_activity_clean %>%
  filter(id %in% percent_of_max_under_75$id) %>%
  ggplot(aes(x = activity_date)) +
  geom_histogram(binwidth = 1, color = "black", fill = "lightblue") +
  labs(x = "Activity Date", y = "Frequency",
       title = "Distribution of Activity Date For IDs with under 75% of Entries")
# Correlation matrix of the numeric variables, visualised as a
# hierarchically clustered lower-triangle heat map with labelled cells
corr <- daily_activity_clean %>%
  select(where(is.numeric)) %>%
  cor()
ggcorrplot(corr,
           hc.order = TRUE,
           type = "lower",
           lab = TRUE,
           colors = c("firebrick", "white", "royalblue"),
           lab_size = 4,
           lab_col = "black",
           title = "Correlation Between Numerical Variables")
#https://rdrr.io/github/microresearcher/MicroVis/man/ggcorrplot.html
The following minute/distance pairs are correlated:
- sedentary_minutes and sedentary_active_distance
- lightly_active_minutes and light_active_distance
- fairly_active_minutes and moderately_active_distance
- very_active_minutes and very_active_distance
# Reuse the correlation matrix computed above
corr_matrix <- corr
# Flag any pair whose absolute correlation exceeds this cutoff
threshold <- 0.60
# Row/column indices of strongly correlated pairs, restricted to the lower
# triangle so each pair appears only once
high_cor_pairs <- which(abs(corr_matrix) > threshold & lower.tri(corr_matrix, diag = FALSE), arr.ind = TRUE)
# Map matrix indices back to variable names and coefficients
variable_names <- colnames(corr_matrix)
cor_values <- as.vector(corr_matrix[high_cor_pairs])
cor_data <- data.frame(
  Variable1 = variable_names[high_cor_pairs[, 1]],
  Variable2 = variable_names[high_cor_pairs[, 2]],
  Correlation = cor_values
)
# Strongest (signed) correlations first, with a clean row index
sorted_cor_data <- cor_data[order(cor_data$Correlation, decreasing = TRUE), ]
row.names(sorted_cor_data) <- NULL
print(sorted_cor_data)
NA
total_distance, tracker_distance, and total_steps are highly correlated; since tracker_distance and total_distance carry nearly the same information, we will retain only total_distance and total_steps.
The following minute and distance types are correlated, which indicates that they report different aspects of the same activity — that is, time and distance:
There is a moderately high correlation between the time spent during very active periods and the total number of steps/total distance:
There is a moderate correlation of 0.61 between the total duration of very active minutes and the estimated daily calories consumed.
There is a moderate correlation of 0.62 between the total distance covered and the estimated daily calories consumed.
There is a moderate correlation coefficient of 0.60 between the distance covered during light activity (light_active_distance) and the total number of steps taken (total_steps).
# Distribution of daily total steps, annotated with summary statistics
steps_vec <- daily_activity_clean$total_steps
boxplot(steps_vec,
        main = "Boxplot of Total Steps",
        ylab = "Total Steps")
# Summary statistics for the annotation
median_value <- median(steps_vec)
std_dev <- round(sd(steps_vec), 2)
outliers <- boxplot.stats(steps_vec)$out
num_outliers <- length(outliers)
# Annotate the plot with median, SD, and outlier count
legend_label <- paste("Median:", median_value,
                      "\nStandard Deviation:", std_dev,
                      "\nOutliers:", num_outliers)
legend("topright", legend = legend_label, pch = "", col = "black", bty = "n", cex = 0.85)
# Per-user mean/median daily steps plus the number of observed days
steps_df <- daily_activity_clean %>%
  group_by(id) %>%
  summarise(
    average_steps = mean(total_steps),
    median_steps = median(total_steps),
    n = n()
  )
steps_df
# Share of users (in %) falling into each average-step band
n_users <- nrow(steps_df)
at_least_10k_avg <- sum(steps_df$average_steps >= 10000) / n_users * 100
between_5K_10K_avg <- sum(steps_df$average_steps >= 5000 & steps_df$average_steps < 10000) / n_users * 100
below_5k_avg <- sum(steps_df$average_steps < 5000) / n_users * 100
# Same bands computed on the per-user medians
at_least_10k_med <- sum(steps_df$median_steps >= 10000) / n_users * 100
between_5K_10K_med <- sum(steps_df$median_steps >= 5000 & steps_df$median_steps < 10000) / n_users * 100
below_5k_med <- sum(steps_df$median_steps < 5000) / n_users * 100
# Assemble the category table, rounded to whole percent
percentage_steps_df <- data.frame(
  Category = c("Below 5,000", "Between 5,000 and 10,000", "At least 10,000"),
  Percentage_Average = round(c(below_5k_avg, between_5K_10K_avg, at_least_10k_avg)),
  Percentage_Median = round(c(below_5k_med, between_5K_10K_med, at_least_10k_med))
)
percentage_steps_df
NA
# Order the step bands for plotting
percentage_steps_df$Category <- factor(percentage_steps_df$Category, levels = c("Below 5,000", "Between 5,000 and 10,000", "At least 10,000"))
# Bar plot: percentage of users per average-step band
ggplot(percentage_steps_df, aes(x = Category, y = Percentage_Average)) +
  geom_bar(stat = "identity", fill = "blue") +
  labs(x = "Average Total Steps", y = "Percentage of Users",
       # Fixed typo in the displayed title: "Step Daily" -> "Steps Daily"
       title = "58% of Users Average 5,000-10,000 Steps Daily",
       subtitle = "Only 21% Achieve the 10,000-Step Goal") +
  geom_text(aes(label = paste0(Percentage_Average, "%")), vjust = -0.5, color = "black") +
  ylim(0, 100) +
  theme_minimal() +
  theme(panel.grid = element_blank())
NA
NA
# Distribution of daily total distance, annotated with summary statistics
distance_vec <- daily_activity_clean$total_distance
boxplot(distance_vec,
        main = "Boxplot of Total Distance",
        ylab = "Total Distance")
# Summary statistics for the annotation
median_value <- median(distance_vec)
std_dev <- sd(distance_vec)
outliers <- boxplot.stats(distance_vec)$out
num_outliers <- length(outliers)
# Annotate the plot with median, SD (2 dp), and outlier count
legend_label <- paste("Median:", round(median_value, 2),
                      "\nStandard Deviation:", round(std_dev, 2),
                      "\nOutliers:", num_outliers)
legend("topright", legend = legend_label, pch = "", col = "black", bty = "n")
# Per-user mean/median total distance plus the number of observed days
t_distance_df <- daily_activity_clean %>%
  group_by(id) %>%
  summarise(
    average_t_distance = mean(total_distance),
    median_t_distance = median(total_distance),
    n = n()
  )
t_distance_df
# Share of users (in %) per average-distance band
n_users_dist <- nrow(t_distance_df)
at_least_10_avg <- sum(t_distance_df$average_t_distance >= 10) / n_users_dist * 100
between_5_10_avg <- sum(t_distance_df$average_t_distance >= 5 & t_distance_df$average_t_distance < 10) / n_users_dist * 100
below_5_avg <- sum(t_distance_df$average_t_distance < 5) / n_users_dist * 100
# Assemble the distance category table, rounded to whole percent
percentage_t_distance_df <- data.frame(
  Category = c("Below 5 km", "Between 5 and 10 km", "At least 10 km"),
  Percentage_Average = round(c(below_5_avg, between_5_10_avg, at_least_10_avg))
)
percentage_t_distance_df
# Order the distance bands, then plot percentage of users per band
percentage_t_distance_df$Category <- factor(percentage_t_distance_df$Category, levels = c("Below 5 km", "Between 5 and 10 km", "At least 10 km"))
ggplot(percentage_t_distance_df, aes(x = Category, y = Percentage_Average)) +
  geom_col(fill = "pink") +
  geom_text(aes(label = paste0(Percentage_Average, "%")), vjust = -0.5, color = "black") +
  labs(x = "Average Total Distance", y = "Percentage of Users",
       title = "55% of Users Average 5-10 Kilometers Daily",
       subtitle = "10,000 steps is approximately equal to covering 5 miles (or 8 kilometers)") +
  ylim(0, 100) +
  theme_minimal() +
  theme(panel.grid = element_blank())
# Distribution of daily sedentary minutes, annotated with summary statistics
sedentary_vec <- daily_activity_clean$sedentary_minutes
boxplot(sedentary_vec,
        main = "Boxplot of Sedentary Minutes",
        ylab = "Sedentary Minutes")
# Summary statistics for the annotation (outliers are also inspected below)
median_value <- median(sedentary_vec)
std_dev <- sd(sedentary_vec)
outliers <- boxplot.stats(sedentary_vec)$out
num_outliers <- length(outliers)
# Annotate the plot with median, SD (2 dp), and outlier count
legend_label <- paste("Median:", round(median_value, 2),
                      "\nStandard Deviation:", round(std_dev, 2),
                      "\nOutliers:", num_outliers)
legend("topright", legend = legend_label, pch = "", col = "black", bty = "n", cex = 0.80)
NA
NA
# Check sedentary_minutes stats
daily_activity_clean$sedentary_minutes %>% summary()
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0 721.2 1020.5 955.2 1189.0 1440.0
outliers
[1] 2 13 0
# Count entries where sedentary minutes equal 1440
count_1440 <- sum(daily_activity_clean$sedentary_minutes == 1440)
# Output the count
count_1440
[1] 7
# Drop days whose sedentary minutes are the boxplot outliers (0, 2, 13) or the
# all-day value of 1440 minutes (a full 24 h sedentary — likely a device default
# when no activity was logged; see the count of 1440s computed above)
daily_activity_clean <- filter(daily_activity_clean, !(sedentary_minutes %in% c(0, 2, 13, 1440)))
# Align column names so both tables share the join key "activity_date"
daily_sleep_clean <- rename(daily_sleep_clean, activity_date = sleep_day)
# Keep only days present in BOTH the activity and sleep data (inner join on id + date)
joined_activity_sleep <- inner_join(daily_activity_clean, daily_sleep_clean, by = c("id", "activity_date"))
# Post-join sanity checks: missing values, duplicate rows, distinct users
missing_n <- sum(is.na(joined_activity_sleep))
duplicate_n <- sum(duplicated(joined_activity_sleep))
unique_ids <- n_distinct(joined_activity_sleep$id)
cat(
  "\n",
  "Missing values:", missing_n, "\n",
  "Duplicate values:", duplicate_n, "\n",
  "Unique Ids:", unique_ids
)
Missing values: 0
Duplicate values: 0
Unique Ids: 24
# Derive sedentary time excluding sleep, an hours version of it, and the share
# of sedentary minutes accounted for by sleep (in percent)
joined_activity_sleep <- joined_activity_sleep %>%
  mutate(
    sedentary_min_awake = sedentary_minutes - total_minutes_asleep,
    sedentary_hours_awake = sedentary_min_awake / 60,
    sedentary_percentage_diff = (sedentary_minutes - sedentary_min_awake) / sedentary_minutes * 100
  )
# Compare sedentary_minutes against the new sedentary_min_awake column via the
# percentage difference derived above
pct_diff_vec <- joined_activity_sleep$sedentary_percentage_diff
boxplot(pct_diff_vec,
        main = "Boxplot of Sedentary Percentage Difference",
        ylab = "Sedentary Percentage Difference")
# Summary statistics for the annotation
median_value <- median(pct_diff_vec)
std_dev <- sd(pct_diff_vec)
outliers <- boxplot.stats(pct_diff_vec)$out
num_outliers <- length(outliers)
# Annotate the plot with median, SD (2 dp), and outlier count
legend_label <- paste("Median:", round(median_value, 2),
                      "\nStandard Deviation:", round(std_dev, 2),
                      "\nOutliers:", num_outliers)
legend("topright", legend = legend_label, pch = "", col = "black", bty = "n", cex = 0.80)
# Distribution of sedentary minutes while awake, annotated with summary stats
awake_vec <- joined_activity_sleep$sedentary_min_awake
boxplot(awake_vec,
        main = "Boxplot of Sedentary Minutes Awake",
        ylab = "Sedentary Minutes Awake")
# Summary statistics for the annotation
median_value <- median(awake_vec)
std_dev <- sd(awake_vec)
outliers <- boxplot.stats(awake_vec)$out
num_outliers <- length(outliers)
# Annotate the plot with median, SD (2 dp), and outlier count
legend_label <- paste("Median:", round(median_value, 2),
                      "\nStandard Deviation:", round(std_dev, 2),
                      "\nOutliers:", num_outliers)
legend("topright", legend = legend_label, pch = "", col = "black", bty = "n", cex = 0.80)
# Count rows reporting less sedentary time than sleep time. These yield negative
# sedentary_min_awake values — presumably inconsistent logging, since sedentary
# minutes would be expected to include time asleep (TODO confirm with data dictionary)
count <- sum(joined_activity_sleep$sedentary_minutes < joined_activity_sleep$total_minutes_asleep)
# Print the count
count
[1] 42
# Subset the dataset
subset_data <- joined_activity_sleep[joined_activity_sleep$sedentary_minutes < joined_activity_sleep$total_minutes_asleep, ]
# View the subsetted data
subset_data
NA
# Check column names of the subsetted data
subset_data %>%
select(sedentary_minutes, total_minutes_asleep, sedentary_min_awake, calories,id, activity_date, total_steps, total_distance, very_active_minutes )
dim(subset_data)
[1] 42 21
dim(joined_activity_sleep)
[1] 408 21
# Drop the inconsistent rows: anti_join() keeps rows of joined_activity_sleep
# with no match in subset_data. With no `by` argument it matches on ALL shared
# columns (see the join_by message below), which is exact here since subset_data
# is a row subset of joined_activity_sleep.
clean_subset<- anti_join(joined_activity_sleep, subset_data)
Joining with `by = join_by(id, activity_date, total_steps, total_distance, tracker_distance, logged_activities_distance, very_active_distance, moderately_active_distance, light_active_distance, sedentary_active_distance, very_active_minutes, fairly_active_minutes, lightly_active_minutes, sedentary_minutes, calories, total_sleep_records, total_minutes_asleep, total_time_in_bed, sedentary_min_awake, sedentary_hours_awake, sedentary_percentage_diff)`
dim(clean_subset)
[1] 366 21
# Re-examine sedentary minutes awake after removing the inconsistent rows
clean_awake_vec <- clean_subset$sedentary_min_awake
boxplot(clean_awake_vec,
        main = "Boxplot of Sedentary Minutes Awake",
        ylab = "Sedentary Minutes Awake")
# Summary statistics for the annotation
median_value <- median(clean_awake_vec)
std_dev <- sd(clean_awake_vec)
outliers <- boxplot.stats(clean_awake_vec)$out
num_outliers <- length(outliers)
# Annotate the plot with median, SD (2 dp), and outlier count
legend_label <- paste("Median:", round(median_value, 2),
                      "\nStandard Deviation:", round(std_dev, 2),
                      "\nOutliers:", num_outliers)
legend("topright", legend = legend_label, pch = "", col = "black", bty = "n", cex = 0.80)
Observation: By eliminating negative values from “sedentary_min_awake,” the resulting values now reflect a more realistic scenario.
# Per-user mean/median sedentary minutes while awake, plus observation counts
t_sedentary_df <- clean_subset %>%
  group_by(id) %>%
  summarise(
    average_sedentary_min_awake = mean(sedentary_min_awake),
    median_sedentary_min_awake = median(sedentary_min_awake),
    n = n()
  )
t_sedentary_df
NA
dataset <- t_sedentary_df
column <- "average_sedentary_min_awake"
new_categories <- c("Below 200 minutes", "Between 200 and 400 minutes", "At least 400 minutes")
# Share of users (in %) per sedentary-awake band.
# BUG FIX: the middle band previously used `<= 400` while the top band used
# `>= 400`, so a user averaging exactly 400 minutes was counted in BOTH bands
# (percentages could sum past 100). The middle band is now half-open [200, 400),
# matching the convention used by the step/distance/calorie bands above.
below_200_avg <- sum(dataset[[column]] < 200) / nrow(dataset) * 100
between_200_400_avg <- sum(dataset[[column]] >= 200 & dataset[[column]] < 400) / nrow(dataset) * 100
at_least_400_avg <- sum(dataset[[column]] >= 400) / nrow(dataset) * 100
# Assemble the category table, rounded to whole percent
percentage_sedentary_awake_df <- data.frame(
  Category = new_categories,
  Percentage_Average = round(c(below_200_avg, between_200_400_avg, at_least_400_avg))
)
# Order the categories for plotting
percentage_sedentary_awake_df$Category <- factor(percentage_sedentary_awake_df$Category, levels = new_categories)
percentage_sedentary_awake_df
NA
NA
NA
# Bar plot: percentage of users per sedentary-awake band
ggplot(percentage_sedentary_awake_df, aes(x = Category, y = Percentage_Average)) +
  geom_col(fill = "gray") +
  geom_text(aes(label = paste0(Percentage_Average, "%")), vjust = -0.5, color = "black") +
  labs(x = "Average Total Sedentary Min Awake", y = "Percentage of Users",
       title = "48% of Users Have an Average of at Least 400 Daily Sedentary Minutes While Awake",
       subtitle = "200 Minutes are 3 hours and 20 minutes; 400 min are 6 hours and 40 min") +
  ylim(0, 100) +
  theme_minimal() +
  theme(panel.grid = element_blank(),
        plot.title = element_text(size = 12),
        plot.subtitle = element_text(size = 10))
In a representative sample of U.S. adults, over two-thirds spent 6 + hours/day sitting, and more than half did not meet the recommended 150 min/week of physical activity. The study discovered that prolonged sitting for 6+ hours/day was associated with higher body fat percentages. While exceeding 150 min/week of physical activity was linked to lower body fat percentages, achieving recommended activity levels may not fully offset the increased body fat from prolonged sitting.
Jingwen Liao, Min Hu, Kellie Imm, Clifton J. Holmes, Jie Zhu, Chao Cao, Lin Yang. Association of daily sitting time and leisure-time physical activity with body fat among U.S. adults. Journal of Sport and Health Science, 2022. ISSN 2095-2546. https://doi.org/10.1016/j.jshs.2022.10.001. (https://www.sciencedirect.com/science/article/pii/S2095254622001016)
# Distribution of daily calories, annotated with summary statistics
calories_vec <- daily_activity_clean$calories
boxplot(calories_vec,
        main = "Boxplot of Calories",
        ylab = "Calories")
# Summary statistics for the annotation (outlier values printed below)
median_value <- median(calories_vec)
std_dev <- round(sd(calories_vec), 2)
outliers <- boxplot.stats(calories_vec)$out
num_outliers <- length(outliers)
# Annotate the plot with median, SD, and outlier count
legend_label <- paste("Median:", median_value,
                      "\nStandard Deviation:", std_dev,
                      "\nOutliers:", num_outliers)
legend("topright", legend = legend_label, pch = "", col = "black", bty = "n", cex = 0.85)
outliers
[1] 4552 4392 4501 4546 4900 4547 4398
# Per-user mean and median daily calorie expenditure
calories_df <- daily_activity_clean %>%
  group_by(id) %>%
  summarise(
    average_calories = mean(calories),
    median_calories = median(calories)
  )
calories_df
NA
# Share of users (in %) per average-calorie band; bands are half-open so no
# user is counted twice
n_users_cal <- nrow(calories_df)
below_1600_avg <- sum(calories_df$average_calories < 1600) / n_users_cal * 100
between_1600_2200_avg <- sum(calories_df$average_calories >= 1600 & calories_df$average_calories < 2200) / n_users_cal * 100
between_2200_3000_avg <- sum(calories_df$average_calories >= 2200 & calories_df$average_calories < 3000) / n_users_cal * 100
at_least_3000_avg <- sum(calories_df$average_calories >= 3000) / n_users_cal * 100
# Same bands computed on the per-user medians
below_1600_med <- sum(calories_df$median_calories < 1600) / n_users_cal * 100
between_1600_2200_med <- sum(calories_df$median_calories >= 1600 & calories_df$median_calories < 2200) / n_users_cal * 100
between_2200_3000_med <- sum(calories_df$median_calories >= 2200 & calories_df$median_calories < 3000) / n_users_cal * 100
at_least_3000_med <- sum(calories_df$median_calories >= 3000) / n_users_cal * 100
# Assemble the calorie category table, rounded to whole percent
percentage_calories_df <- data.frame(
  Category = c("Below 1,600", "Between 1,600 and 2,200", "Between 2,200 and 3,000", "At least 3,000"),
  Percentage_Average = round(c(below_1600_avg, between_1600_2200_avg, between_2200_3000_avg, at_least_3000_avg)),
  Percentage_Median = round(c(below_1600_med, between_1600_2200_med, between_2200_3000_med, at_least_3000_med))
)
# Order the categories for plotting
percentage_calories_df$Category <- factor(percentage_calories_df$Category, levels = c("Below 1,600", "Between 1,600 and 2,200", "Between 2,200 and 3,000", "At least 3,000"))
percentage_calories_df
NA
# Bar plot: percentage of users per average-calorie band
ggplot(percentage_calories_df, aes(x = Category, y = Percentage_Average)) +
  geom_col(fill = "red") +
  geom_text(aes(label = paste0(Percentage_Average, "%")), vjust = -0.5, color = "black") +
  labs(x = "Calorie Categories", y = "Percentage of Users",
       title = "42% of Users Have an Average Daily Calorie Expenditure Between 1,600 and 2,200.",
       subtitle = "Most females require 1,600 to 2,200 calories per day, as per the Dietary Guidelines for Americans, 2020-2025") +
  ylim(0, 100) +
  theme_minimal() +
  theme(panel.grid = element_blank(),
        plot.title = element_text(size = 12),
        plot.subtitle = element_text(size = 10))
“Females ages 19 through 30 require about 1,800 to 2,400 calories a day. Males in this age group have higher calorie needs of about 2,400 to 3,000 a day. Calorie needs for adults ages 31 through 59 are generally lower; most females require about 1,600 to 2,200 calories a day and males require about 2,200 to 3,000 calories a day.”
U.S. Department of Agriculture and U.S. Department of Health and Human Services. Dietary Guidelines for Americans, 2020-2025. 9th Edition. December 2020. Available at DietaryGuidelines.gov/
VeryActiveMinutes: Total minutes spent in very active activity
FairlyActiveMinutes: Total minutes spent in moderate activity
LightlyActiveMinutes: Total minutes spent in light activity
SedentaryMinutes: Total minutes spent in sedentary activity
# Per-user average minutes spent at each activity intensity level
activity_minutes_df <- daily_activity_clean %>%
  group_by(id) %>%
  summarise(
    average_very_active_minutes = mean(very_active_minutes),
    average_fairly_active_minutes = mean(fairly_active_minutes),
    average_lightly_active_minutes = mean(lightly_active_minutes),
    average_sedentary_minutes = mean(sedentary_minutes)
  )
activity_minutes_df
NA
NA
# Define the custom order of legend items
custom_order <- c( "Very Active", "Fairly Active", "Lightly Active", "Sedentary")
# NOTE(review): despite the intent of a "stacked" chart, these four geom_bar
# layers are OVERLAID, not stacked — each layer's bars start at x = 0, so the
# later (shorter) bars are drawn on top of the longer ones and simply cover part
# of them. For a true stacked chart, pivot the data to long format and map
# `fill` to the intensity type in a single geom_bar layer.
ggplot(activity_minutes_df, aes(y = id)) +
geom_bar(aes(x = average_sedentary_minutes, fill = "Sedentary"), stat = "identity", width = 0.5) +
geom_bar(aes(x = average_lightly_active_minutes, fill = "Lightly Active"), stat = "identity", width = 0.5) +
geom_bar(aes(x = average_fairly_active_minutes, fill = "Fairly Active"), stat = "identity", width = 0.5) +
geom_bar(aes(x = average_very_active_minutes, fill = "Very Active"), stat = "identity", width = 0.5) +
xlab("Minutes") +
ylab("ID") +
ggtitle("Average Activity Minutes by ID") +
scale_fill_manual(name = "", values = c("Very Active" = "red", "Fairly Active" = "orange", "Lightly Active" = "lightgreen", "Sedentary" = "lightblue"), breaks = custom_order) +
theme_minimal() +
theme(legend.position = "bottom", panel.grid = element_blank())
NA
NA
NA
NA
NA
NA
NA
NA
# Overall mean minutes per intensity level, and each level's share of the total
intensity_cols <- c("average_very_active_minutes",
                    "average_fairly_active_minutes",
                    "average_lightly_active_minutes",
                    "average_sedentary_minutes")
averages <- colMeans(activity_minutes_df[, intensity_cols])
total_average <- sum(averages)
proportions <- averages / total_average
# Table of averages with their percentage shares
overall_average_df <- data.frame(Average = averages,
                                 Percentage = proportions * 100)
# Label rows by intensity level (same order as intensity_cols)
row_names <- c("Very Active", "Fairly Active", "Lightly Active", "Sedentary")
row.names(overall_average_df) <- row_names
overall_average_df
NA
NA
# Horizontal bars: each intensity level's share of total average minutes,
# ordered by share, with the rounded percentage printed beside each bar
ggplot(overall_average_df,
       aes(x = Percentage,
           y = reorder(row.names(overall_average_df), Percentage),
           fill = row.names(overall_average_df))) +
  geom_bar(stat = "identity", width = 0.7, show.legend = FALSE) +
  geom_text(aes(label = paste0(round(Percentage), "%")), hjust = -0.2, color = "black", size = 4) +
  ylab("Minutes Intensity") +
  xlab("Percentage") +
  ggtitle("Users' Overall Average Intensity Minutes Consist Primarily of Sedentary and Lightly Active Time") +
  scale_fill_manual(values = c("Very Active" = "red", "Fairly Active" = "orange", "Lightly Active" = "lightgreen", "Sedentary" = "lightblue")) +
  scale_x_continuous(labels = NULL) +
  theme_minimal() +
  theme(legend.position = "none", panel.grid = element_blank(), axis.text.y = element_text(size = 10))
NA
NA
“Analyzing each individual’s average calorie intake can provide insights into their individual dietary habits and patterns. By comparing the individual averages to the overall average, you can identify individuals who consume more or fewer calories compared to the group average. This comparison can help in understanding variations in calorie intake and potential factors influencing individual differences.”
# Define the custom order of legend items
custom_order <- c("Very Active", "Fairly Active", "Lightly Active", "Sedentary")
# Single stacked horizontal bar: all four rows share the same y value
# (factor(1)), so ggplot's default position = "stack" stacks the segments
ggplot(overall_average_df, aes(x = Percentage, y = factor(1), fill = factor(row.names(overall_average_df), levels = custom_order))) +
geom_bar(stat = "identity", width = 0.7) +
xlab("Percentage") +
ylab("") +
ggtitle("Users' Overall Average Intensity Minutes Consist Primarily of Sedentary and Lightly Active Time") +
scale_fill_manual(
name = "",
values = c(
"Very Active" = "red",
"Fairly Active" = "orange",
"Lightly Active" = "lightgreen",
"Sedentary" = "lightblue"
),
breaks = custom_order
) +
guides(fill = guide_legend(reverse = TRUE)) + # Reverse the order of the legend
theme_minimal() +
theme(legend.position = "top",
panel.grid = element_blank(),
axis.text.y = element_blank(), # Remove the y-axis text
plot.title = element_text(size = 12, margin = margin(b = 20))) + # Adjust the title size and margin
# NOTE(review): the 97% marker below is hard-coded; if the underlying averages
# change, this x position and label must be updated by hand
geom_vline(xintercept = 97, color = "black", linetype = "dashed") +
annotate("text", x = 97, y = 1, label = " 97%", vjust = -5.5, hjust = 0.1)
NA
NA
These indicators provide insights into activity levels, sedentary behavior, and calorie burn. They can help track progress, set goals, and evaluate user behavior over time. Remember to consider the specific context and goals of your analysis to select and customize the most relevant KPIs for your use case. The context I will use is the guidelines for physical activity and diet for Americans:
str(daily_sleep_clean)
tibble [410 × 5] (S3: tbl_df/tbl/data.frame)
$ id : chr [1:410] "1503960366" "1503960366" "1503960366" "1503960366" ...
$ activity_date : Date[1:410], format: "2016-04-12" "2016-04-13" "2016-04-15" "2016-04-16" ...
$ total_sleep_records : num [1:410] 1 2 1 2 1 1 1 1 1 1 ...
$ total_minutes_asleep: num [1:410] 327 384 412 340 700 304 360 325 361 430 ...
$ total_time_in_bed : num [1:410] 346 407 442 367 712 320 377 364 384 449 ...
#Sanity check: Verify that the value of total_time_in_bed is greater than total_minutes_asleep, as we would expect.
daily_sleep_clean[daily_sleep_clean$total_time_in_bed < daily_sleep_clean$total_minutes_asleep,]
# Numeric columns of the cleaned sleep data
numerical_cols <- daily_sleep_clean %>%
  select_if(is.numeric)
# Plot each numeric column as a density-scaled histogram with an overlaid
# kernel density curve
col_names <- colnames(numerical_cols)
for (i in col_names) {
  suppressWarnings(print(
    ggplot(numerical_cols, aes(.data[[i]])) +
      geom_histogram(
        bins = 30,
        color = "black",
        fill = "gray",
        # after_stat(density) replaces the deprecated `..density..` notation
        aes(y = after_stat(density))
      ) +
      geom_density(
        color = "blue",
        size = 1
      ) +
      # BUG FIX: the y axis shows density (histogram is density-scaled), so the
      # label "Count" was wrong
      xlab(i) + ylab("Density") +
      ggtitle(paste("Histogram with Density Plot of", i))
  ))
}
# Pairwise correlations across the numeric sleep columns
sleep_corr_input <- select_if(daily_sleep_clean, is.numeric)
corr <- cor(sleep_corr_input)
# Lower-triangle heatmap, hierarchically clustered, with coefficients printed
ggcorrplot(
  corr,
  hc.order = TRUE,
  type = "lower",
  lab = TRUE,
  lab_size = 4,
  lab_col = "black",
  colors = c("firebrick", "white", "royalblue"),
  title = "Correlation Between Numerical Variables"
)
NA
NA
ggplot(data = daily_sleep_clean, aes(x = total_minutes_asleep, y = total_time_in_bed)) +
geom_point()
# Frequency (and percentage) of each total_sleep_records value
frequency_table <- as.data.frame(table(daily_sleep_clean$total_sleep_records))
frequency_table$Percentage <- frequency_table$Freq / sum(frequency_table$Freq) * 100
# Bar chart with "count (percent)" labels above each bar
ggplot(frequency_table, aes(x = Var1, y = Freq)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = paste(Freq, " (", percent(Percentage / 100), ")", sep = "")),
            hjust = 0.5, vjust = -0.4, color = "black") +
  labs(x = "Total Sleep Records", y = "Frequency",
       title = "Uncommon Napping: 89% of Sleep Records Indicate a Singular Sleep Period.",
       subtitle = "Includes naps > 60 min.") +
  theme_minimal() +
  theme(panel.grid = element_blank(),
        plot.title = element_text(size = 12),
        plot.subtitle = element_text(size = 10, margin = margin(b = 20)))
NA
NA
NA
# Distribution of total minutes asleep, annotated with summary statistics
asleep_vec <- daily_sleep_clean$total_minutes_asleep
boxplot(asleep_vec,
        main = "Boxplot of Total Minutes Asleep",
        ylab = "Total Minutes Asleep")
# Summary statistics for the annotation
median_value <- median(asleep_vec)
std_dev <- round(sd(asleep_vec), 2)
outliers <- boxplot.stats(asleep_vec)$out
num_outliers <- length(outliers)
# Annotate the plot with median, SD, and outlier count
legend_label <- paste("Median:", median_value,
                      "\nStandard Deviation:", std_dev,
                      "\nOutliers:", num_outliers)
legend("topright", legend = legend_label, pch = "", col = "black", bty = "n", cex = 0.85)
# Per-user mean and standard deviation of nightly sleep minutes, with counts
sleep_df <- daily_sleep_clean %>%
  group_by(id) %>%
  summarise(
    average_sleep_minutes = mean(total_minutes_asleep),
    standard_deviation_sleep_minutes = sd(total_minutes_asleep),
    n = n()
  )
sleep_df
NA
NA
NA
# Drop ID "2320127002" due to insufficient data for computing mean and standard deviation.
sleep_df <- sleep_df %>%
filter(id != "2320127002")
sleep_df
# Share of users (in %) per average sleep-duration band
# (360 min = 6 h, 420 min = 7 h)
n_sleepers <- nrow(sleep_df)
below_6_hours <- sum(sleep_df$average_sleep_minutes < 360) / n_sleepers * 100
between_6_7_hours <- sum(sleep_df$average_sleep_minutes >= 360 & sleep_df$average_sleep_minutes < 420) / n_sleepers * 100
at_least_7_hours <- sum(sleep_df$average_sleep_minutes >= 420) / n_sleepers * 100
# Assemble the sleep category table, rounded to whole percent
percentage_sleep_df <- data.frame(
  Category = c("Below 6 hours", "Between 6 and 7 hours", "At least 7 hours"),
  Percentage_Average = round(c(below_6_hours, between_6_7_hours, at_least_7_hours))
)
# Order the categories for plotting
percentage_sleep_df$Category <- factor(percentage_sleep_df$Category, levels = c("Below 6 hours", "Between 6 and 7 hours", "At least 7 hours"))
percentage_sleep_df
str(percentage_sleep_df)
'data.frame': 3 obs. of 2 variables:
$ Category : Factor w/ 3 levels "Below 6 hours",..: 1 2 3
$ Percentage_Average: num 30 22 48
# Bar plot: percentage of users per average sleep-duration band
ggplot(percentage_sleep_df, aes(x = Category, y = Percentage_Average)) +
  geom_col(fill = "purple") +
  geom_text(aes(label = paste0(Percentage_Average, "%")), vjust = -0.5, color = "black") +
  labs(x = "Average Sleep Duration", y = "Percentage of Users",
       title = "52% of Users Get Less Than 7 Hours of Sleep on Average Daily") +
  ylim(0, 100) +
  theme_minimal() +
  theme(panel.grid = element_blank(), plot.title = element_text(size = 12), plot.subtitle = element_text(size = 10))
# Error bars
# Convert per-user mean and SD from minutes to hours
sleep_df$average_sleep_hours <- sleep_df$average_sleep_minutes / 60
sleep_df$standard_deviation_sleep_hours <- sleep_df$standard_deviation_sleep_minutes / 60
# Bar per user with +/- 1 SD error bars; dashed red line marks 7 hours
ggplot(sleep_df, aes(x = id, y = average_sleep_hours)) +
  geom_bar(stat = "identity", fill = "skyblue", color = "black") +
  # Consistency fix: reuse the hours column computed just above instead of
  # re-dividing the minutes column by 60 inside the aesthetics
  geom_errorbar(aes(ymin = average_sleep_hours - standard_deviation_sleep_hours,
                    ymax = average_sleep_hours + standard_deviation_sleep_hours),
                width = 0.2, position = position_dodge(0.9), color = "black") +
  labs(x = "ID", y = "Average Sleep Duration (hours)",
       title = "Sleep Consistency: Average Sleep Duration with Error Bars",
       subtitle = "Error bars represent the standard deviation around the mean.") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_hline(yintercept = 7, linetype = "dashed", color = "red") +
  scale_y_continuous(breaks = seq(0, 12, 1)) # Adjust the range as needed
# Recompute per-user sleep statistics directly in hours
sleep_df <- daily_sleep_clean %>%
  group_by(id) %>%
  summarise(n = n(),
            average_sleep_hours = mean(total_minutes_asleep) / 60, # minutes -> hours
            average_time_in_bed_hours = mean(total_time_in_bed) / 60,
            standard_deviation_sleep_hours = sd(total_minutes_asleep) / 60,
            standard_deviation_time_in_bed_hours = sd(total_time_in_bed) / 60,
            # BUG FIX: the per-user SD of awake-in-bed time must be computed
            # from each user's DAILY values inside summarise(). The previous
            # code called sd(time_difference_hours) in mutate() after
            # summarising, which took the SD across users' averages — a single
            # global value recycled to every row — not per-user variability.
            sd_awake_in_bed_hours = sd((total_time_in_bed - total_minutes_asleep) / 60)
  ) %>%
  mutate(time_difference_hours = average_time_in_bed_hours - average_sleep_hours,
         average_awake_in_bed_hours = time_difference_hours) # alias kept for readability
sleep_df
NA
NA
NA
# Drop ID "2320127002" due to insufficient data for computing mean and standard deviation.
sleep_df <- sleep_df %>%
filter(id != "2320127002")
dim(sleep_df)
[1] 23 9
# Draw annotated boxplots for several columns of a data frame in one
# multi-panel (two-column) output.
#
# data_frame:         data frame containing the columns to plot
# columns_to_analyze: character vector of column names to plot
# decimal_places:     rounding applied to the median/SD annotations (default 2)
#
# Each panel shows the column's boxplot with a legend reporting its median,
# standard deviation, and number of boxplot outliers.
create_boxplots_in_one_output <- function(data_frame, columns_to_analyze, decimal_places = 2) {
  num_columns <- length(columns_to_analyze)
  num_rows <- ceiling(num_columns / 2)
  # Save the graphics state and restore it even if a plot errors out
  # (the original reset par() only on the success path)
  old_par <- par(mfrow = c(num_rows, 2))
  on.exit(par(old_par), add = TRUE)
  # seq_len() (not 1:num_columns) is safe when no columns are supplied
  for (i in seq_len(num_columns)) {
    column_name <- columns_to_analyze[i]
    boxplot(data_frame[[column_name]],
            ylab = column_name)
    median_value <- median(data_frame[[column_name]])
    std_dev <- round(sd(data_frame[[column_name]]), decimal_places)
    outliers <- boxplot.stats(data_frame[[column_name]])$out
    num_outliers <- length(outliers)
    legend_label <- paste("Median:", round(median_value, decimal_places),
                          "\nSD:", std_dev,
                          "\nOutliers:", num_outliers)
    legend("topright", legend = legend_label, pch = "", col = "black", bty = "n", cex = 0.75)
  }
}
# Columns to analyze
columns_to_analyze <- c("average_sleep_hours", "average_awake_in_bed_hours")
# Call the function to create boxplots in one output
create_boxplots_in_one_output(sleep_df, columns_to_analyze, decimal_places = 2)
# Columns to analyze
columns_to_analyze <- c("standard_deviation_sleep_hours", "sd_awake_in_bed_hours")
# Call the function
create_boxplots_in_one_output(sleep_df, columns_to_analyze, decimal_places = 2)
# Columns with outliers to remove
columns_with_outliers <- c("average_sleep_hours", "average_awake_in_bed_hours", "standard_deviation_sleep_hours")

# Drop every row whose value in `column_name` is flagged as an outlier by
# boxplot.stats() (i.e. lies beyond the boxplot whiskers).
#
# Args:
#   data:        data.frame to filter
#   column_name: name of the numeric column to screen (string)
#
# Returns: `data` without the outlier rows.
# NOTE: matching is by value (%in%), so a non-outlier row that happens to
# share a value with an outlier would also be dropped.
remove_outliers <- function(data, column_name) {
  # `$out` holds the outlying VALUES themselves, not the whisker bounds
  outlier_values <- boxplot.stats(data[[column_name]])$out
  data_no_outliers <- data[!(data[[column_name]] %in% outlier_values), ]
  return(data_no_outliers)
}
# Remove outliers column-by-column; each pass filters sleep_df further.
for (column in columns_with_outliers) {
  sleep_df <- remove_outliers(sleep_df, column)
}
sleep_df
# Confirm the outlier removal worked by re-drawing the boxplots
columns_to_analyze <- c(
  "average_sleep_hours",
  "average_awake_in_bed_hours",
  "standard_deviation_sleep_hours"
)
create_boxplots_in_one_output(sleep_df, columns_to_analyze, decimal_places = 2)
# Divide users into irregular and regular sleepers. Anyone whose
# sleep-hours standard deviation exceeds the 75th percentile is labelled
# "irregular"; everyone else is "regular".
threshold <- quantile(sleep_df$standard_deviation_sleep_hours, 0.75)
# Derive the "sleeper_type" column from the threshold
sleep_df <- sleep_df %>%
  mutate(sleeper_type = ifelse(standard_deviation_sleep_hours > threshold,
                               "irregular", "regular"))
sleep_df
# Counts per sleeper type
table(sleep_df$sleeper_type)
irregular regular
4 13
sleep_df
color_options <- c("#E69F00", "#0072B2") # Blue: "#0072B2", Orange: "#E69F00"

# Build a violin-plus-boxplot comparison of `y_axis_col` across the groups
# in `x_axis_col` (both supplied as column-name strings).
#
# Uses the `.data` pronoun inside aes() instead of aes_string(), which has
# been deprecated since ggplot2 3.0.0.
#
# Returns: a ggplot object (print it to render).
create_violin_plot <- function(data, x_axis_col, y_axis_col) {
  ggplot(data, aes(x = .data[[x_axis_col]], y = .data[[y_axis_col]],
                   fill = .data[[x_axis_col]])) +
    geom_violin(scale = "width", draw_quantiles = c(0.25, 0.5, 0.75), trim = FALSE) +
    geom_boxplot(width = 0.1, fill = "white", color = "black") +
    labs(x = "Sleeper Type", y = y_axis_col,
         title = paste("Comparison", x_axis_col, "for", y_axis_col)) +
    scale_fill_manual(values = color_options) +
    theme_minimal()
}
# Render one violin plot per sleep metric
for (metric in c("average_sleep_hours", "standard_deviation_sleep_hours",
                 "average_awake_in_bed_hours", "sd_awake_in_bed_hours")) {
  print(create_violin_plot(sleep_df, "sleeper_type", metric))
}
Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
Please use tidy evaluation ideoms with `aes()`
Observations:
Regular sleepers tend to have higher median average sleep hours compared to irregular sleepers. This suggests that individuals classified as regular sleepers are likely getting more sleep on average than those categorized as irregular sleepers.
Additionally, the spread of the “average_sleep_hours” for irregular sleepers appears to be wider, indicating more variability in their sleep duration. In contrast, the violin plot for regular sleepers shows a narrower spread, suggesting that their sleep duration is more consistent.
Regular sleepers exhibit a slightly higher median average awake-in-bed duration compared to irregular sleepers.
Summary: Regular sleepers get more sleep on average, have a more consistent sleep duration, and slightly higher median awake-in-bed duration than irregular sleepers.
str(minute_sleep_clean)
tibble [187,978 × 4] (S3: tbl_df/tbl/data.frame)
$ id : chr [1:187978] "1503960366" "1503960366" "1503960366" "1503960366" ...
$ date : POSIXct[1:187978], format: "2016-04-12 02:47:30" "2016-04-12 02:48:30" "2016-04-12 02:49:30" "2016-04-12 02:50:30" ...
$ value : Factor w/ 3 levels "1","2","3": 3 2 1 1 1 1 1 2 2 2 ...
$ log_id: num [1:187978] 1.14e+10 1.14e+10 1.14e+10 1.14e+10 1.14e+10 ...
This data seems to come from the Classic Sleep Log (1 minute)
Value indicating the sleep state. 1 = asleep, 2 = restless, 3 = awake
For more detail, check the Fitbit data dictionary.
# Add human-readable labels to the `value` column
# (1 = asleep, 2 = restless, 3 = awake, per the Fitbit data dictionary)
minute_sleep_clean$value <- factor(minute_sleep_clean$value, levels = c("1", "2", "3"), labels = c("asleep", "restless", "awake"))
# Quick overview of the labelled minute-level sleep data
summary(minute_sleep_clean)
id date value log_id
Length:187978 Min. :2016-04-11 20:48:00.00 asleep :171960 Min. :1.137e+10
Class :character 1st Qu.:2016-04-19 02:48:00.00 restless: 14002 1st Qu.:1.144e+10
Mode :character Median :2016-04-26 21:48:00.00 awake : 2016 Median :1.150e+10
Mean :2016-04-26 13:31:23.11 Mean :1.150e+10
3rd Qu.:2016-05-03 23:47:00.00 3rd Qu.:1.155e+10
Max. :2016-05-12 09:56:00.00 Max. :1.162e+10
# Frequency and percentage share of each sleep state ("value") across all
# minute-level records.
frequency_table <- as.data.frame(table(minute_sleep_clean$value))
frequency_table$Percentage <- frequency_table$Freq / sum(frequency_table$Freq) * 100
# Bar chart of sleep-state counts, each bar labelled with count + share.
# geom_col() is the idiomatic shorthand for geom_bar(stat = "identity").
ggplot(data = frequency_table, aes(x = Var1, y = Freq)) +
  geom_col(fill = "#008080") +
  geom_text(aes(label = paste0(Freq, " (", percent(Percentage / 100), ")")),
            hjust = 0.5, vjust = -0.4, color = "black") +
  labs(x = "Total Minutes Records", y = "Frequency",
       title = "User Sleep States: 91% of Minutes Spent Asleep with Minimal Interruptions",
       subtitle = "Restlessness: 7.4% | Awake: 1.1%") +
  theme_minimal() +
  theme(panel.grid = element_blank(),
        plot.title = element_text(size = 12),
        plot.subtitle = element_text(size = 10, margin = margin(b = 20)))
Weight Reporting Behavior: Assess the reporting behavior of users using the IsManualReport variable. Calculate the percentage of manual weight reports compared to total weight reports to determine how actively users are providing weight updates. This can indicate user engagement and motivation to track their weight.
Data Completeness: Analyze the completeness and quality of data using variables such as LogId. Assess if there are missing or erroneous data points that may impact the analysis. Exclude or handle such data points appropriately to ensure accurate insights.
BMI distribution (percentage above normal BMI)
Weight Trends: Analyze the trends in weight over time (Date) to understand if users are experiencing weight loss, gain, or stability. Plotting weight (WeightKg or WeightPounds) against time can reveal patterns, fluctuations, or significant changes in weight. You can also calculate descriptive statistics such as average weight, standard deviation, or rate of weight change to gain insights into user behavior.
https://www.cdc.gov/mmwr/volumes/68/wr/mm6823a1.htm
file:///Users/vivianbarros/Desktop/Physical_Activity_Guidelines_2nd_edition.pdf
#paper https://dl.acm.org/doi/pdf/10.1145/3339825.3394926
kaggle https://www.kaggle.com/datasets/arashnic/fitbit/discussion/313589?page=2
this is it:
https://www.cdc.gov/physicalactivity/data/inactivity-prevalence-maps/index.html#Race-Ethnicity
https://www.bls.gov/tus/data/datafiles-0321.htm
Histograms: https://statisticsbyjim.com/basics/histograms/ https://blog.minitab.com/en/3-things-a-histogram-can-tell-you
Categorical, ordinal, interval, and ratio variables : https://www.graphpad.com/guides/prism/latest/statistics/the_different_kinds_of_variabl.htm
Add density line to histogram: https://r-coder.com/density-plot-r
——————— showing notebook in github
convert to jupyter notebook option: https://medium.datadriveninvestor.com/transforming-your-rmd-to-ipynb-file-r-markdown-to-python-jupyter-b1306646f50b
Hey all,
Sorry if I’m misunderstanding here, but I have been knitting the .Rmd notebook to a .md file within RStudio, and it seems to display very well in GitHub. You can see an example in my repo to see if I’m on track with this thread.
The links below give the explanation. Short Version:
change “output=html_document” to “output=github_document”
knit the document push the .md file to GitHub instead of the .Rmd be sure to push the ’_files’ folder to include any images https://rmarkdown.rstudio.com/github_document_format.html https://gist.github.com/JoshuaTPierce/b919168421b40e06481080eb53c3fb2f